import numpy as np
from numpy import random
import pandas as pd
from scipy import stats
from scipy.stats import norm #normal
from scipy.stats import genextreme as gev #generalized extreme value
from scipy.stats import pareto #pareto
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')np.random.seed(613)
pass_fail = stats.bernoulli.rvs(p = 0.7, size = 100)
pass_failarray([0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1])
Probability of Success = p
round(sum(pass_fail) / len(pass_fail), 2)0.62
Probabilty of Failure = q = 1 - p
round(1 - (sum(pass_fail) / len(pass_fail)), 2)0.38
pass_fail = np.array(pass_fail, dtype=str)
sns.histplot(pass_fail,
color = 'darkblue')
plt.show()grade = np.random.choice(np.array(['A','B','C','F']), 100, p=[0.15,0.55,0.2,0.1])
gradearray(['B', 'C', 'B', 'C', 'B', 'A', 'A', 'B', 'F', 'C', 'A', 'B', 'B',
'B', 'B', 'C', 'B', 'B', 'A', 'B', 'A', 'A', 'B', 'B', 'B', 'B',
'A', 'B', 'A', 'A', 'B', 'F', 'C', 'B', 'B', 'B', 'F', 'B', 'C',
'A', 'B', 'B', 'C', 'B', 'B', 'B', 'B', 'B', 'B', 'C', 'B', 'B',
'A', 'A', 'A', 'C', 'B', 'F', 'B', 'B', 'B', 'B', 'B', 'C', 'B',
'A', 'A', 'F', 'A', 'B', 'B', 'C', 'B', 'B', 'B', 'B', 'B', 'B',
'B', 'B', 'B', 'B', 'B', 'F', 'F', 'A', 'A', 'C', 'B', 'B', 'A',
'B', 'B', 'B', 'B', 'B', 'A', 'B', 'B', 'B'], dtype='<U1')
P = {}
P["A"] = sum(grade == "A") / len(grade)
P["B"] = sum(grade == "B") / len(grade)
P["C"] = sum(grade == "C") / len(grade)
P["F"] = sum(grade == "F") / len(grade)
P{'A': 0.2, 'B': 0.61, 'C': 0.12, 'F': 0.07}
grade.sort()
sns.histplot(grade,
color = 'darkblue')
plt.show()score = np.random.normal(loc=60, scale=15, size=100)
scorearray([49.42178966, 60.11318944, 57.52655737, 55.18500198, 83.58669807,
60.60611116, 62.84955862, 91.88147056, 58.18561255, 44.43607505,
56.52203474, 50.3039921 , 73.73464322, 70.39393145, 71.64669984,
68.42587924, 28.40244391, 38.53671815, 66.48988439, 63.57931322,
52.35943507, 51.69438108, 72.61734787, 68.00083258, 32.2147696 ,
62.812808 , 90.66467772, 40.40392912, 80.68296546, 54.57851271,
52.43785367, 49.71713692, 44.48677425, 34.22687865, 79.11074983,
33.04023258, 55.9149938 , 63.05036148, 78.04091144, 46.51584864,
59.93718497, 73.32257878, 52.59278977, 76.8084071 , 74.4950962 ,
67.11017229, 55.39331216, 56.46019432, 56.90356894, 43.26492979,
73.47407378, 55.42948985, 60.05844776, 36.53838227, 55.98379844,
63.00929694, 68.01131318, 70.25352878, 71.27606349, 54.99293147,
75.80943182, 47.86434823, 44.67846184, 45.20892105, 54.01298278,
37.86224058, 70.24213513, 64.20159803, 88.30329909, 38.15488611,
35.13638258, 54.26145695, 48.48772986, 74.51270407, 53.70828047,
78.23249199, 69.77339593, 62.31208649, 44.30688124, 79.06567921,
67.89160877, 51.87449517, 17.56268311, 49.42229534, 56.74312746,
63.24380317, 45.36827356, 59.07366461, 48.74952759, 74.39034928,
89.44170169, 67.96653877, 84.11970041, 61.24483127, 62.50491433,
56.72530542, 26.22567409, 63.09046925, 67.84436935, 70.26057196])
The probability of a specific outcome are extremely low because there are an infinite number of possible outcomes:
sum(score == 80.0) / len(score)0.0
Instead, look at higher, lower, or a range:
sum((score > 75) & (score < 85)) / len(score)0.09
sns.histplot(score,
kde = True,
bins = 20,
color = 'darkblue')
plt.show()A single binary event is a Bernoulli distribution, a collection of Bernoulli events is a binomial distribution.
If true, then for n trials and r successes:
#Eg. filliping a coin 10 times, 10000 times
trials = []
for i in range(0, 10000):
sample = stats.bernoulli.rvs(p = 0.5, size = 10)
trials.append(sum(sample))
print(trials[0:100])[4, 4, 7, 3, 3, 5, 8, 5, 5, 4, 4, 6, 6, 5, 4, 1, 4, 5, 4, 6, 5, 6, 4, 5, 3, 5, 6, 6, 7, 2, 5, 4, 7, 3, 4, 5, 6, 5, 6, 4, 4, 7, 5, 7, 3, 5, 6, 5, 4, 2, 3, 3, 3, 5, 4, 4, 6, 4, 3, 6, 6, 6, 5, 4, 7, 3, 5, 4, 5, 8, 4, 6, 5, 2, 4, 5, 5, 4, 2, 6, 7, 5, 4, 2, 7, 7, 6, 7, 4, 6, 2, 5, 5, 6, 6, 5, 5, 6, 6, 9]
#Expected Mean = n * p = 10 * 0.5 = 5
np.mean(trials)4.9915
#Expected Standard Deviation = sqrt(n * p * (1 - p)) = sqrt(10 * 0.5 * (1 - 0.5)) = 1.58
np.std(trials)1.5848115818607587
sns.histplot(trials,
color = 'darkblue',
bins = 10)
plt.axvline(np.mean(trials), color = 'r', ls = '--', lw = 2.0)
plt.axvline(np.mean(trials) - np.std(trials), color = 'g', ls = '--', lw = 2.0)
plt.axvline(np.mean(trials) + np.std(trials), color = 'g', ls = '--', lw = 2.0)
plt.show()If true, then x given λ events in the past:
#Eg. 25 hurricanes in the past 22 years
lam = 25.0 / 22
distribution = stats.poisson.pmf(range(0, 11), mu = lam)sns.barplot(x = [0,1,2,3,4,5,6,7,8,9,10],
y = distribution,
color = 'darkblue')
plt.axvline(lam, color = 'r', ls = '--', lw = 2.0)
plt.axvline(lam - np.sqrt(lam), color = 'g', ls = '--', lw = 2.0)
plt.axvline(lam + np.sqrt(lam), color = 'g', ls = '--', lw = 2.0)
plt.show()#Probability of only 1
stats.poisson.pmf(1, mu = lam)0.364754678578128
#Probability of less than 3
sum(stats.poisson.pmf(range(0, 3), mu = lam))0.892985772191726
#Probability of more than 2
1 - sum(stats.poisson.pmf(range(0, 2), mu = lam))0.31426120427311943
x = np.array(range(-500,501)) * 0.01
plt.plot(x, stats.norm.pdf(x, 0, 1))
plt.title('PDF (probability density function) of a standard normal distribution')
plt.show()plt.plot(x,stats.norm.cdf(x, 0, 1))
plt.title('CDF (cumulative distribution function) of a standard normal distribution')
plt.show()Impact of σ
x = np.array(range(0, 2000)) * 0.01
mu = 10
fig, ax = plt.subplots(figsize=(7.5, 5))
for sigma in range(1,6):
ax.plot(x, stats.norm.pdf(x, mu, sigma),label='sigma={0}'.format(sigma))
ax.axvline(mu, color = 'r', ls = '--', lw = 2.0)
ax.legend()
ax.set_title('PDF of a normal distribution')fig, ax = plt.subplots(figsize=(7.5, 5))
for sigma in range(1,6):
ax.plot(x,stats.norm.cdf(x, mu, sigma), label = 'sigma={0}'.format(sigma))
ax.legend()
ax.set_title('CDF of a normal distribution')Percentiles
\(Z=\frac{x-\mu}{\sigma}\)
Example
sample = np.random.normal(loc=60, scale=15, size=1000)
sample.mean()59.63032031988109
sample.std()15.717177638571053
sns.histplot(sample,
bins = 20,
color = 'blue',
stat='density')
sns.kdeplot(sample,
color = 'darkblue',
linewidth = 4)
plt.axvline(sample.mean(), color = 'r', ls = '--', lw = 2.0)
plt.axvline(sample.mean() - sample.std(), color = 'g', ls = '--', lw = 2.0)
plt.axvline(sample.mean() + sample.std(), color = 'g', ls = '--', lw = 2.0)
plt.show()# Above Mean, Right: Probability > 80
1 - norm.cdf(80, loc = sample.mean(), scale = sample.std())0.09748535909394129
# Above Mean, Left: Probability < 80
norm.cdf(80, loc = sample.mean(), scale = sample.std())0.9025146409060587
# Below Mean, Left: Probability < 40
norm.cdf(40, loc = sample.mean(), scale = sample.std())0.10583759323323055
# Below Mean, Right: Probability > 40
1 - norm.cdf(40, loc = sample.mean(), scale = sample.std())0.8941624067667695
# Middle: Probability 40 < x < 80
norm.cdf(80, loc = sample.mean(), scale = sample.std()) - norm.cdf(40, loc = sample.mean(), scale = sample.std())0.7966770476728282
def distribution_analysis(x, log_scale = False, fit_distribution = 'None', bins = 50, vis_means = True):
#x - array of observations
#log_scale - analyze distribution of log(x) if True
#fit_distribution - fit the distribution ('normal', 'gev' or 'pareto') or do nothing if 'None'
#bins - how many bins to use for binning the data
#vis_means - show mean and std lines if True
if log_scale:
x1 = np.log10(x) #convert data to decimal logarithms
xlabel = 'log(values)' #reflect in x labels
else:
x1 = x #leave original scale
xlabel = 'values'
mu = x1.mean() #compute the mean
if log_scale: #if logscale, output log mean, its original scale, and original scale mean
print('Log mean = {:.2f}({:.2f}), mean = {:.2f}'.format(mu, 10**mu, x.mean()))
else:
print('Mean = {:.2f}'.format(mu)) #otherwise print mean
sigma = x1.std() #compute and output standard deviation
print('Standard deviation = {:.2f}'.format(sigma))
#visualize histogram and the interpolated line using seaborn
sns.histplot(x1, bins = bins, color = 'darkblue', stat = 'density')
sns.kdeplot(x1, color = 'darkblue', linewidth = 4)
#show vertical lines for mean and std if vis_means = True
if vis_means:
plt.axvline(mu, color = 'r', ls = '--', lw=2.0)
plt.axvline(mu-sigma, color = 'g', ls = '--', lw = 2.0)
plt.axvline(mu+sigma, color = 'g', ls = '--', lw = 2.0)
ylim = plt.gca().get_ylim() #keep the y-range of original distribution density values
#(to make sure the fitted distribution would not affect it)
h = np.arange(mu - 3 * sigma, mu + 3 * sigma, sigma / 100) #3-sigma visualization range for the fitted distribution
pars = None #fitted distribution parameters
#fit and visualize the theoretic distribution
if fit_distribution == 'normal':
pars = norm.fit(x1)
plt.plot(h,norm.pdf(h,*pars),'r')
elif fit_distribution == 'gev':
pars = gev.fit(x1)
plt.plot(h,gev.pdf(h,*pars),'r')
elif fit_distribution == 'pareto':
pars = pareto.fit(x1)
plt.plot(h,pareto.pdf(h,*pars),'r')
plt.xlabel(xlabel) #add x label
plt.ylim(ylim) #restore the y-range of original distribution density values
plt.show()distribution_analysis(sample)Mean = 59.63
Standard deviation = 15.72
distribution_analysis(sample, fit_distribution = "normal")Mean = 59.63
Standard deviation = 15.72
distribution_analysis(sample, fit_distribution = "pareto")Mean = 59.63
Standard deviation = 15.72
from gapminder import gapminder
asia_07 = gapminder[(gapminder["continent"] == "Asia") & (gapminder["year"] == 2007)]["lifeExp"].sort_values()
africa_07 = gapminder[(gapminder["continent"] == "Africa") & (gapminder["year"] == 2007)]["lifeExp"].sort_values()def is_normal(x, sig=0.05):
#check is the distribution is normal using one-sample KS test and sample mean-std
test_result = stats.kstest(x,'norm', args=(x.mean(),x. std()))
if test_result[1] < sig:
print("Reject the null (p = " + str(round(test_result[1], 3)) + "), can't consider normal")
else:
print("Fail to reject the null (p = " + str(round(test_result[1], 3)) + "), safe to assume normal")is_normal(asia_07)Fail to reject the null (p = 0.324), safe to assume normal
is_normal(africa_07)Fail to reject the null (p = 0.691), safe to assume normal
sns.histplot(asia_07,
bins = 20,
color = 'blue',
stat='density',
label = "Asia")
sns.kdeplot(asia_07,
color = 'darkblue',
linewidth = 4)
sns.histplot(africa_07,
bins = 20,
color = 'red',
stat='density',
label = "Africa")
sns.kdeplot(africa_07,
color = 'darkred',
linewidth = 4)
plt.legend()
plt.show()stats.ttest_ind(asia_07, africa_07)Ttest_indResult(statistic=7.9273954920841065, pvalue=9.074251950172207e-12)
from statsmodels.distributions.empirical_distribution import ECDF
plt.plot(asia_07,
stats.norm.cdf(asia_07, asia_07.mean(), asia_07.std()),
color = 'blue',
label = "Asia")
asia_ecdf = ECDF(asia_07)
plt.plot(asia_ecdf.x, asia_ecdf.y, color = "lightblue")
plt.plot(africa_07,
stats.norm.cdf(africa_07, africa_07.mean(), africa_07.std()),
color = 'red',
label = "Africa")
africa_ecdf = ECDF(africa_07)
plt.plot(africa_ecdf.x, africa_ecdf.y, color = "salmon")
plt.legend()
plt.show()stats.ks_2samp(asia_07, africa_07)KstestResult(statistic=0.7389277389277389, pvalue=2.5241475576365247e-11)